The purpose of this file is to practice applying basic R commands learned through lecture videos and perform exploratory analysis on worldwide COVID-19 data.
library(dplyr)
library(lubridate)
library(knitr)
library(ggplot2)
library(cowplot)Data was read in and summarized as shown below: (current file is as of 5/31)
owid <- read.csv(url("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"), na.strings = c("", "NA"), colClasses = c("date" = "Date"), header = TRUE)
dim(owid)## [1] 94441 59
head(owid)
str(owid)
summary(owid)Data set, including possible confounders for use in later analysis, is cleaned and filtered below.
owid.filtered <- owid %>%
rename(iso.code = iso_code, totcases = total_cases, totdeaths = total_deaths, totcases.mil = total_cases_per_million, totdeaths.mil = total_deaths_per_million, pop.density = population_density, med.age = median_age, older70 = aged_70_older, gdp.cap = gdp_per_capita, card.death = cardiovasc_death_rate, diab.prev = diabetes_prevalence, fem.smokers = female_smokers, male.smokers = male_smokers) %>%
select(c(iso.code:totcases, totdeaths, totcases.mil, totdeaths.mil, population, pop.density, med.age, older70, gdp.cap, card.death, diab.prev, fem.smokers, male.smokers)) %>% #Renamed variables and filtered to keep only those of interest.
filter(date == "2020-05-31") #Filtered out all but most recent date (most recent date is a running sum of total cases).
table(owid.filtered$continent, useNA = "ifany") #Data includes both continent and country-specific outcomes. Continent outcomes can be distinguished by "NA" in the continent column; the continent name is in the location column. ##
## Africa Asia Europe North America Oceania
## 54 47 46 23 4
## South America <NA>
## 12 9
More exploratory analysis of data frames:
dim(owid.filtered)
head(owid.filtered)
tail(owid.filtered)Data is filtered to remove entries for specific country locations and include only data that aggregates COVID-19 outcomes on a continent and world scale.
owid.continents <- owid.filtered %>%
filter(is.na(continent)) %>%
select(-continent) #Used missingness properties to select only continent rows and then removed continent column.
table(owid.continents$location)##
## Africa Asia Europe European Union International
## 1 1 1 1 1
## North America Oceania South America World
## 1 1 1 1
Data is filtered to include only country-specific COVID-19 outcomes and exclude continent and world-wide data.
owid.countries <- owid.filtered %>%
filter(!is.na(continent)) #Removed all rows with NA for continent, leaving behind only countries.
table(owid.countries$continent)##
## Africa Asia Europe North America Oceania
## 54 47 46 23 4
## South America
## 12
table(owid.filtered$totcases, useNA = "ifany") %>%
tail() #No missing values for total cases column. ##
## 1138778 1160536 1798793 1959812 2031973 6188259
## 1 1 1 1 1 1
table(owid.filtered$totdeaths, useNA = "ifany") %>%
tail() #Several missing values for total deaths column. Keep this in mind when analyzing. ##
## 107861 126360 127104 172683 391267 <NA>
## 1 1 1 1 1 19
which(is.na(owid.filtered$totdeaths))## [1] 22 31 51 57 63 72 98 101 118 123 138 148 149 150 156 177 182 189 191
#Missing countries: Butan, Cambodia, Dominica, Eritrea, Fiji, Grenada, Laos, Lesotho, Mongolia, Namibia, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Seychelles, Timor, Uganda, Vatican, Vietnam.
#Missingness for confounders:
owid.countries[!complete.cases(owid.countries), ] #Most missingness in smoking categories. ## iso.code continent location date totcases
## 1 AFG Asia Afghanistan 2020-05-31 15208
## 4 AND Europe Andorra 2020-05-31 764
## 5 AGO Africa Angola 2020-05-31 86
## 6 ATG North America Antigua and Barbuda 2020-05-31 26
## 18 BLZ North America Belize 2020-05-31 18
## 20 BTN Asia Bhutan 2020-05-31 43
## 21 BOL South America Bolivia 2020-05-31 9982
## 28 BDI Africa Burundi 2020-05-31 63
## 29 KHM Asia Cambodia 2020-05-31 125
## 30 CMR Africa Cameroon 2020-05-31 5904
## 33 CAF Africa Central African Republic 2020-05-31 1011
## 34 TCD Africa Chad 2020-05-31 778
## 41 CIV Africa Cote d'Ivoire 2020-05-31 2833
## 43 CUB North America Cuba 2020-05-31 2045
## 46 COD Africa Democratic Republic of Congo 2020-05-31 3070
## 49 DMA North America Dominica 2020-05-31 16
## 54 GNQ Africa Equatorial Guinea 2020-05-31 1306
## 55 ERI Africa Eritrea 2020-05-31 39
## 59 FJI Oceania Fiji 2020-05-31 18
## 62 GAB Africa Gabon 2020-05-31 2655
## 68 GRD North America Grenada 2020-05-31 23
## 69 GTM North America Guatemala 2020-05-31 5087
## 70 GIN Africa Guinea 2020-05-31 3706
## 71 GNB Africa Guinea-Bissau 2020-05-31 1256
## 72 GUY South America Guyana 2020-05-31 153
## 74 HND North America Honduras 2020-05-31 5202
## 75 HKG Asia Hong Kong 2020-05-31 1084
## 81 IRQ Asia Iraq 2020-05-31 6439
## 87 JOR Asia Jordan 2020-05-31 739
## 90 OWID_KOS Europe Kosovo 2020-05-31 1070
## 93 LAO Asia Laos 2020-05-31 19
## 96 LSO Africa Lesotho 2020-05-31 2
## 98 LBY Africa Libya 2020-05-31 156
## 99 LIE Europe Liechtenstein 2020-05-31 82
## 102 MDG Africa Madagascar 2020-05-31 771
## 108 MRT Africa Mauritania 2020-05-31 530
## 112 MCO Europe Monaco 2020-05-31 99
## 113 MNG Asia Mongolia 2020-05-31 179
## 118 NAM Africa Namibia 2020-05-31 24
## 122 NIC North America Nicaragua 2020-05-31 759
## 125 MKD Europe North Macedonia 2020-05-31 2226
## 129 PSE Asia Palestine 2020-05-31 448
## 131 PNG Oceania Papua New Guinea 2020-05-31 8
## 133 PER South America Peru 2020-05-31 164476
## 141 KNA North America Saint Kitts and Nevis 2020-05-31 15
## 142 LCA North America Saint Lucia 2020-05-31 18
## 143 VCT North America Saint Vincent and the Grenadines 2020-05-31 26
## 144 SMR Europe San Marino 2020-05-31 671
## 145 STP Africa Sao Tome and Principe 2020-05-31 483
## 148 SRB Europe Serbia 2020-05-31 11412
## 149 SYC Africa Seychelles 2020-05-31 11
## 154 SOM Africa Somalia 2020-05-31 1976
## 157 SSD Africa South Sudan 2020-05-31 994
## 160 SDN Africa Sudan 2020-05-31 4800
## 164 SYR Asia Syria 2020-05-31 122
## 165 TWN Asia Taiwan 2020-05-31 442
## 166 TJK Asia Tajikistan 2020-05-31 3930
## 169 TLS Asia Timor 2020-05-31 24
## 171 TTO North America Trinidad and Tobago 2020-05-31 117
## 174 UGA Africa Uganda 2020-05-31 417
## 181 VAT Europe Vatican 2020-05-31 12
## 182 VEN South America Venezuela 2020-05-31 1510
## 183 VNM Asia Vietnam 2020-05-31 328
## totdeaths totcases.mil totdeaths.mil population pop.density med.age older70
## 1 258 390.667 6.628 38928341 54.422 18.6 1.337
## 4 51 9888.048 660.066 77265 163.755 NA NA
## 5 4 2.617 0.122 32866268 23.890 16.8 1.362
## 6 3 265.501 30.635 97928 231.845 32.1 4.631
## 18 2 45.269 5.030 397621 16.426 25.0 2.279
## 20 NA 55.727 NA 771612 21.188 28.6 2.977
## 21 313 855.134 26.814 11673029 10.202 25.4 4.393
## 28 1 5.298 0.084 11890781 423.062 17.5 1.504
## 29 NA 7.477 NA 16718971 90.672 25.6 2.385
## 30 191 222.408 7.195 26545864 50.885 18.8 1.919
## 33 2 209.327 0.414 4829764 7.479 18.3 2.251
## 34 65 47.364 3.957 16425859 11.833 16.7 1.446
## 41 33 107.399 1.251 26378275 76.399 18.7 1.582
## 43 83 180.548 7.328 11326616 110.408 43.1 9.719
## 46 72 34.278 0.804 89561404 35.879 17.0 1.745
## 49 NA 222.250 NA 71991 98.567 NA NA
## 54 12 930.872 8.553 1402985 45.194 22.4 1.752
## 55 NA 10.997 NA 3546427 44.304 19.3 2.171
## 59 NA 20.079 NA 896444 49.562 28.6 3.284
## 62 17 1192.868 7.638 2225728 7.859 23.1 2.976
## 68 NA 204.410 NA 112519 317.132 29.4 5.021
## 69 108 283.943 6.028 17915567 157.834 22.9 3.016
## 70 23 282.194 1.751 13132792 51.755 19.0 1.733
## 71 8 638.212 4.065 1967998 66.191 19.4 1.565
## 72 12 194.518 15.256 786559 3.952 26.3 2.837
## 74 212 525.210 21.404 9904608 82.805 24.9 2.883
## 75 4 144.591 0.534 7496988 7039.714 44.8 10.158
## 81 205 160.085 5.097 40222503 88.125 20.0 1.957
## 87 9 72.429 0.882 10203140 109.285 23.2 2.361
## 90 30 553.608 15.522 1932774 168.155 NA NA
## 93 NA 2.611 NA 7275556 29.715 24.4 2.322
## 96 NA 0.934 NA 2142252 73.562 22.2 2.647
## 98 5 22.703 0.728 6871287 3.623 29.0 2.816
## 99 1 2150.143 26.221 38137 237.012 NA NA
## 102 6 27.843 0.217 27691019 43.951 19.6 1.686
## 108 23 113.987 4.947 4649660 4.289 20.3 1.792
## 112 4 2522.679 101.926 39244 19347.500 NA NA
## 113 NA 54.602 NA 3278292 1.980 28.6 2.421
## 118 NA 9.445 NA 2540916 3.078 22.0 2.085
## 122 35 114.574 5.283 6624554 51.667 27.3 3.519
## 125 133 1068.456 63.839 2083380 82.600 39.1 8.160
## 129 3 87.819 0.588 5101416 778.202 20.4 1.726
## 131 NA 0.894 NA 8947027 18.220 22.6 2.142
## 133 20710 4988.377 628.112 32971846 25.129 29.1 4.455
## 141 NA 281.997 NA 53192 212.865 NA NA
## 142 NA 98.024 NA 183629 293.187 34.9 6.405
## 143 NA 234.346 NA 110947 281.787 31.8 4.832
## 144 42 19771.348 1237.551 33938 556.667 NA NA
## 145 12 2203.859 54.754 219161 212.841 18.7 2.162
## 148 243 1677.102 35.711 6804596 80.291 41.2 NA
## 149 NA 111.857 NA 98340 208.354 36.2 5.586
## 154 78 124.330 4.908 15893219 23.500 16.8 1.496
## 157 10 88.800 0.893 11193729 NA 19.2 2.032
## 160 286 109.466 6.522 43849269 23.258 19.7 2.034
## 164 5 6.971 0.286 17500657 NA 21.7 2.577
## 165 7 18.558 0.294 23816775 NA 42.2 8.353
## 166 47 412.052 4.928 9537642 64.281 23.3 2.155
## 169 NA 18.203 NA 1318442 87.176 18.0 1.897
## 171 8 83.602 5.716 1399491 266.886 36.2 5.819
## 174 NA 9.117 NA 45741000 213.759 16.4 1.308
## 181 NA 14833.127 NA 809 NA NA NA
## 182 14 53.102 0.492 28435943 36.253 29.0 3.915
## 183 NA 3.370 NA 97338583 308.127 32.6 4.718
## gdp.cap card.death diab.prev fem.smokers male.smokers
## 1 1803.987 597.029 9.59 NA NA
## 4 NA 109.135 7.97 29.0 37.8
## 5 5819.495 276.045 3.94 NA NA
## 6 21490.943 191.511 13.17 NA NA
## 18 7824.362 176.957 17.11 NA NA
## 20 8708.597 217.066 9.75 NA NA
## 21 6885.829 204.299 6.89 NA NA
## 28 702.225 293.068 6.05 NA NA
## 29 3645.070 270.892 4.00 2.0 33.7
## 30 3364.926 244.661 7.20 NA NA
## 33 661.240 435.727 6.10 NA NA
## 34 1768.153 280.995 6.10 NA NA
## 41 3601.006 303.740 2.42 NA NA
## 43 NA 190.968 8.27 17.1 53.3
## 46 808.133 318.949 6.10 NA NA
## 49 9673.367 227.376 11.62 NA NA
## 54 22604.873 202.812 7.78 NA NA
## 55 1510.459 311.110 6.05 0.2 11.4
## 59 8702.975 412.820 14.49 10.2 34.8
## 62 16562.413 259.967 7.20 NA NA
## 68 13593.877 243.964 10.71 NA NA
## 69 7423.808 155.898 10.18 NA NA
## 70 1998.926 336.717 2.42 NA NA
## 71 1548.675 382.474 2.42 NA NA
## 72 7435.047 373.159 11.62 NA NA
## 74 4541.795 240.208 7.21 2.0 NA
## 75 56054.920 NA 8.33 NA NA
## 81 15663.986 218.612 8.83 NA NA
## 87 8337.490 208.257 11.75 NA NA
## 90 9795.834 NA NA NA NA
## 93 6397.360 368.111 4.00 7.3 51.2
## 96 2851.153 405.126 3.94 0.4 53.9
## 98 17881.509 341.862 10.43 NA NA
## 99 NA NA 7.77 NA NA
## 102 1416.440 405.994 3.94 NA NA
## 108 3597.633 232.347 2.42 NA NA
## 112 NA NA 5.46 NA NA
## 113 11840.846 460.043 4.82 5.5 46.5
## 118 9541.808 243.811 3.94 9.7 34.2
## 122 5321.444 137.016 11.47 NA NA
## 125 13111.214 322.688 10.08 NA NA
## 129 4449.898 265.910 10.59 NA NA
## 131 3823.194 561.494 17.65 23.5 48.8
## 133 12236.706 85.755 5.95 4.8 NA
## 141 24654.385 NA 12.84 NA NA
## 142 12951.839 204.620 11.62 NA NA
## 143 10727.146 252.675 11.62 NA NA
## 144 56861.470 NA 5.64 NA NA
## 145 3052.714 270.113 2.42 NA NA
## 148 14048.881 439.415 10.08 37.7 40.2
## 149 26382.287 242.648 10.55 7.1 35.7
## 154 NA 365.769 6.05 NA NA
## 157 1569.888 280.775 10.43 NA NA
## 160 4466.507 431.388 15.67 NA NA
## 164 NA 376.264 NA NA NA
## 165 NA 103.957 NA NA NA
## 166 2896.913 427.698 7.11 NA NA
## 169 6570.102 335.346 6.86 6.3 78.1
## 171 28763.071 228.467 10.97 NA NA
## 174 1697.707 213.333 2.50 3.4 16.7
## 181 NA NA NA NA NA
## 182 16745.022 204.850 6.47 NA NA
## 183 6171.884 245.465 6.00 1.0 45.9
Filtered OWID data frames further narrowed to COVID-19 cases only, deaths only, and hospitalizations only, separated by continent and by country respectively.
#By continent:
select(owid.continents, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by continent.
select(owid.continents, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by continent.
select(owid.continents, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by continent.
#By country:
select(owid.countries, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by country.
select(owid.countries, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by country.
select(owid.countries, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by country. Plots for exploratory analysis comparing COVID-19 cases by continent for total cases, total cases per million people, total deaths, and total deaths per million people.
continents1 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totcases)) +
geom_bar(stat="identity") +
geom_text(aes(label = totcases), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Cases by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases by continent.
continents2 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totcases.mil)) +
geom_bar(stat="identity") +
geom_text(aes(label = totcases.mil), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Cases per Million People by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases per Million") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases per million by continent.
plot_grid(continents1, continents2, labels = "AUTO")continents3 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totdeaths)) +
geom_bar(stat="identity") +
geom_text(aes(label = totdeaths), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Deaths by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths by continent.
continents4 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totdeaths.mil)) +
geom_bar(stat="identity") +
geom_text(aes(label = totdeaths.mil), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Deaths per Million by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths per million by continent.
plot_grid(continents3, continents4, labels = "AUTO")Barplots filtering out the 10 countries with the highest COVID-19 cases per million people on each continent, as well as a heatmap of all countries, are the outputs of the code below:
ctry.mean.cases <- owid.countries %>%
group_by(continent) %>%
summarise(totcases.mil = mean(totcases.mil, na.rm = TRUE)) #Mean COVID-19 cases per country, grouped by continent.
mean.cases <- function(x) {
ctry.mean.cases %>%
filter(continent == x) %>%
select(totcases.mil) %>%
as.numeric()
} #Function to produce numeric value of mean COVID-19 cases per country by continent to be used in bar graphs below.
top.ctry.cases <- function(x) {
owid.countries %>%
filter(continent == x) %>%
arrange(desc(totcases.mil)) %>%
head(n=10) %>%
ggplot(aes(x = location, y = totcases.mil)) +
geom_bar(stat = "identity") +
geom_abline(slope=0, intercept= mean.cases(x), col = "red", lty=2) +
labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(top.ctry.cases(i))
} #Loops function for continents of interest. ctry.mean.deaths <- owid.countries %>%
group_by(continent) %>%
summarise(totdeaths.mil = mean(totdeaths.mil, na.rm = TRUE)) #Mean COVID-19 daeths per country, grouped by continent.
mean.deaths <- function(x) {
ctry.mean.deaths %>%
filter(continent == x) %>%
select(totdeaths.mil) %>%
as.numeric()
} #Function to produce numeric value of mean COVID-19 deaths per country by continent to be used in bar graphs below.
top.ctry.deaths <- function(x) {
owid.countries %>%
filter(continent == x) %>%
arrange(desc(totdeaths.mil)) %>%
head(n=10) %>%
ggplot(aes(x = location, y = totdeaths.mil)) +
geom_bar(stat = "identity") +
geom_abline(slope=0, intercept= mean.deaths(x), col = "red", lty=2) +
labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(top.ctry.deaths(i))
} #Loops function for continents of interest. owid %>%
rename(iso.code = iso_code, totcases.mil = total_cases_per_million) %>%
select(c(iso.code:date, totcases.mil)) %>%
filter(!is.na(continent)) %>%
ggplot(aes(x = date, y = location, fill = totcases.mil)) +
geom_tile() +
labs(title = "Total COVID-19 Cases per Million Over Time", size = 22) +
scale_x_date(date_breaks = "1 month") +
scale_fill_gradient(low = "#ffeda0", high = "#f03b20") +
ggsave(file = "test.image.pdf", width = 20, height = 25) #Total cases per million over time by country as heatmap. owid %>%
rename(iso.code = iso_code, new.cases = new_cases) %>%
select(c(iso.code:date, new.cases)) %>%
filter(!is.na(continent)) %>%
ggplot(aes(x = date, y = location, fill = new.cases)) +
geom_tile() +
labs(title = "New COVID-19 Cases per Million Over Time", size = 22) +
scale_x_date(date_breaks = "1 month") +
scale_fill_gradient(low = "#ffeda0", high = "#f03b20", limits = c(0, 20000)) +
ggsave(file = "test.image2.pdf", width = 20, height = 25) #New cases per million over time by country as heatmap. Difficult to scale because range is so large. Population size and population density are evaluated below as confounding variables for COVID-19 cases. These variables are first analyzed on a full-world scale, and then by continent.
ggplot(data = owid.countries, aes(x = population, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
labs(title = ("World COVID-19 Cases per Million People vs. Population Size")) #COVID-19 cases and population size on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totcases.mil, owid.countries$population, use = "complete.obs") ## [1] -0.05455269
pop.lm.cases <- lm(totcases.mil ~ population, data = owid.countries)
pop.lm.cases##
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
##
## Coefficients:
## (Intercept) population
## 1.378e+03 -9.934e-07
summary.lm(pop.lm.cases) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1375.3 -1273.2 -1085.4 170.6 18393.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.378e+03 2.092e+02 6.588 4.54e-10 ***
## population -9.934e-07 1.340e-06 -0.741 0.46
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2750 on 184 degrees of freedom
## Multiple R-squared: 0.002976, Adjusted R-squared: -0.002443
## F-statistic: 0.5492 on 1 and 184 DF, p-value: 0.4596
ggplot(data = owid.countries, aes(x = pop.density, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
labs(title = ("World COVID-19 Cases per Million People vs. Population Density")) #COVID-19 cases and population size on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totcases.mil, owid.countries$pop.density, use = "complete.obs") ## [1] 0.09592666
pop.den.lm.cases <- lm(totcases.mil ~ pop.density, data = owid.countries)
pop.den.lm.cases##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
##
## Coefficients:
## (Intercept) pop.density
## 1231.377 0.152
summary.lm(pop.den.lm.cases) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2157.1 -1177.8 -994.2 130.6 18487.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1231.3772 195.4280 6.301 2.21e-09 ***
## pop.density 0.1520 0.1176 1.293 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2578 on 180 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.009202, Adjusted R-squared: 0.003697
## F-statistic: 1.672 on 1 and 180 DF, p-value: 0.1977
cases.pop <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = population, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Size"))
}
pop.lm.cases.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totcases.mil ~ population,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.pop(i))
print(pop.lm.cases.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1497.85 -345.09 -175.71 -90.24 2662.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.970e+02 1.820e+02 2.182 0.0406 *
## population 1.399e-05 2.434e-06 5.749 1.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 820.1 on 21 degrees of freedom
## Multiple R-squared: 0.6115, Adjusted R-squared: 0.593
## F-statistic: 33.05 on 1 and 21 DF, p-value: 1.047e-05
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2695.4 -2216.3 -1191.6 828.5 16764.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.007e+03 6.407e+02 4.694 2.63e-05 ***
## population -6.117e-06 1.972e-05 -0.310 0.758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3758 on 44 degrees of freedom
## Multiple R-squared: 0.002183, Adjusted R-squared: -0.0205
## F-statistic: 0.09624 on 1 and 44 DF, p-value: 0.7579
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1496.4 -1408.7 -1201.6 507.7 18248.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.509e+03 4.997e+02 3.020 0.00415 **
## population -1.384e-06 1.671e-06 -0.828 0.41190
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3236 on 45 degrees of freedom
## Multiple R-squared: 0.01502, Adjusted R-squared: -0.006873
## F-statistic: 0.686 on 1 and 45 DF, p-value: 0.4119
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -321.11 -232.55 -199.00 -14.54 3069.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.283e+02 9.379e+01 3.500 0.000964 ***
## population -2.899e-06 2.168e-06 -1.337 0.187074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 564.8 on 52 degrees of freedom
## Multiple R-squared: 0.03323, Adjusted R-squared: 0.01464
## F-statistic: 1.787 on 1 and 52 DF, p-value: 0.1871
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## 1 2 3 4
## 16.16 -67.22 196.03 -144.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.078e+01 1.313e+02 0.615 0.601
## population 7.275e-06 9.564e-06 0.761 0.526
##
## Residual standard error: 179.2 on 2 degrees of freedom
## Multiple R-squared: 0.2243, Adjusted R-squared: -0.1635
## F-statistic: 0.5785 on 1 and 2 DF, p-value: 0.5263
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1446.1 -1227.4 -1084.0 11.2 5023.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.319e+03 7.569e+02 1.742 0.112
## population 6.354e-06 1.143e-05 0.556 0.591
##
## Residual standard error: 2204 on 10 degrees of freedom
## Multiple R-squared: 0.02996, Adjusted R-squared: -0.06705
## F-statistic: 0.3088 on 1 and 10 DF, p-value: 0.5906
cases.pop.den <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = pop.density, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Density"))
}
pop.den.lm.cases.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totcases.mil ~ pop.density,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.pop.den(i))
print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1199.9 -668.7 -356.4 -7.5 4244.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1292.670 405.934 3.184 0.00446 **
## pop.density -2.893 1.689 -1.713 0.10149
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1232 on 21 degrees of freedom
## Multiple R-squared: 0.1226, Adjusted R-squared: 0.08078
## F-statistic: 2.933 on 1 and 21 DF, p-value: 0.1015
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2356.3 -2006.6 -957.2 929.5 17129.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.633e+03 5.072e+02 5.191 5.4e-06 ***
## pop.density 1.617e-02 1.751e-01 0.092 0.927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3332 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0001984, Adjusted R-squared: -0.02305
## F-statistic: 0.008531 on 1 and 43 DF, p-value: 0.9268
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3520.3 -1236.9 -1147.3 556.9 18445.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1229.2082 523.4504 2.348 0.0235 *
## pop.density 0.3460 0.3177 1.089 0.2822
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3277 on 43 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.02684, Adjusted R-squared: 0.004206
## F-statistic: 1.186 on 1 and 43 DF, p-value: 0.2822
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -266.99 -236.07 -181.56 -44.54 3127.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 272.6243 102.1690 2.668 0.0102 *
## pop.density -0.1263 0.6192 -0.204 0.8392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 579.3 on 51 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.000815, Adjusted R-squared: -0.01878
## F-statistic: 0.0416 on 1 and 51 DF, p-value: 0.8392
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## 1 2 3 4
## 26.13 12.58 136.11 -174.81
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 273.491 130.823 2.091 0.172
## pop.density -5.367 4.677 -1.148 0.370
##
## Residual standard error: 158 on 2 degrees of freedom
## Multiple R-squared: 0.397, Adjusted R-squared: 0.09555
## F-statistic: 1.317 on 1 and 2 DF, p-value: 0.3699
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1791.1 -1201.3 -903.1 -36.4 4919.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 934.07 1096.78 0.852 0.414
## pop.density 25.10 36.77 0.683 0.510
##
## Residual standard error: 2187 on 10 degrees of freedom
## Multiple R-squared: 0.04454, Adjusted R-squared: -0.051
## F-statistic: 0.4662 on 1 and 10 DF, p-value: 0.5103
Population size and population density are evaluated below as confounding variables for COVID-19 deaths. These variables are first analyzed on a full-world scale, and then by continent.
ggplot(data = owid.countries, aes(x = population, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
labs(title = ("World COVID-19 Deaths per Million People vs. Population Size")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$population, use = "complete.obs") ## [1] -0.01920185
pop.lm.deaths <- lm(totdeaths.mil ~ population, data = owid.countries)
pop.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
##
## Coefficients:
## (Intercept) population
## 6.344e+01 -1.960e-08
summary.lm(pop.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.12 -60.42 -54.86 -29.27 1174.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.344e+01 1.307e+01 4.853 2.8e-06 ***
## population -1.960e-08 7.946e-08 -0.247 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 162.4 on 165 degrees of freedom
## (19 observations deleted due to missingness)
## Multiple R-squared: 0.0003687, Adjusted R-squared: -0.00569
## F-statistic: 0.06086 on 1 and 165 DF, p-value: 0.8054
ggplot(data = owid.countries, aes(x = pop.density, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
labs(title = ("World COVID-19 Deaths per Million People vs. Population Density")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$pop.density, use = "complete.obs") ## [1] 0.006573727
pop.den.lm.deaths <- lm(totdeaths.mil ~ pop.density, data = owid.countries)
pop.den.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
##
## Coefficients:
## (Intercept) pop.density
## 6.345e+01 6.254e-04
summary.lm(pop.den.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.32 -60.86 -56.24 -29.86 1173.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.345e+01 1.308e+01 4.851 2.86e-06 ***
## pop.density 6.254e-04 7.475e-03 0.084 0.933
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 163.7 on 162 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 4.321e-05, Adjusted R-squared: -0.006129
## F-statistic: 0.007001 on 1 and 162 DF, p-value: 0.9334
deaths.pop <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = population, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Population Size"))
}
pop.lm.deaths.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totdeaths.mil ~ population,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.pop(i))
print(pop.lm.deaths.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.650 -22.027 -15.337 8.501 155.684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.990e+01 1.203e+01 1.654 0.118
## population 8.979e-07 1.424e-07 6.306 1.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.09 on 16 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.7131, Adjusted R-squared: 0.6952
## F-statistic: 39.77 on 1 and 16 DF, p-value: 1.044e-05
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295.08 -139.13 -109.91 22.27 1084.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.527e+02 4.546e+01 3.359 0.00165 **
## population 1.196e-06 1.384e-06 0.864 0.39225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 262.7 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01707, Adjusted R-squared: -0.005785
## F-statistic: 0.7469 on 1 and 43 DF, p-value: 0.3923
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.333 -9.482 -6.405 -1.058 81.553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.171e+01 3.120e+00 3.753 0.000569 ***
## population -5.163e-09 9.759e-09 -0.529 0.599776
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.77 on 39 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.007125, Adjusted R-squared: -0.01833
## F-statistic: 0.2799 on 1 and 39 DF, p-value: 0.5998
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.050 -4.417 -2.140 1.130 49.289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.471e+00 1.514e+00 3.613 0.000734 ***
## population -2.837e-08 3.370e-08 -0.842 0.404077
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.608 on 47 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.01486, Adjusted R-squared: -0.006102
## F-statistic: 0.7089 on 1 and 47 DF, p-value: 0.4041
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.684e+00 NaN NaN NaN
## population -2.529e-08 NaN NaN NaN
##
## Residual standard error: NaN on 0 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 1 and 0 DF, p-value: NA
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -87.26 -77.23 -64.89 -32.83 538.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.471e+01 6.396e+01 1.168 0.270
## population 4.588e-07 9.662e-07 0.475 0.645
##
## Residual standard error: 186.2 on 10 degrees of freedom
## Multiple R-squared: 0.02206, Adjusted R-squared: -0.07574
## F-statistic: 0.2255 on 1 and 10 DF, p-value: 0.6451
deaths.pop.den <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = pop.density, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Population Density"))
}
pop.den.lm.cases.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totdeaths.mil ~ pop.density,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.pop.den(i))
print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest.##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.355 -45.177 -22.267 7.791 252.855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.2774 28.0891 2.822 0.0123 *
## pop.density -0.1761 0.1187 -1.483 0.1574
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 82.43 on 16 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.1209, Adjusted R-squared: 0.06593
## F-statistic: 2.2 on 1 and 16 DF, p-value: 0.1574
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.39 -148.77 -121.59 2.42 1064.84
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 173.722636 40.334921 4.307 9.42e-05 ***
## pop.density -0.001816 0.013921 -0.130 0.897
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 265 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0003954, Adjusted R-squared: -0.02285
## F-statistic: 0.01701 on 1 and 43 DF, p-value: 0.8968
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.516 -9.817 -6.638 -0.171 80.153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.754973 3.283937 3.884 0.00041 ***
## pop.density -0.001575 0.001856 -0.848 0.40173
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.99 on 37 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.01908, Adjusted R-squared: -0.007434
## F-statistic: 0.7196 on 1 and 37 DF, p-value: 0.4017
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.894 -4.129 -2.571 0.850 49.621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.501368 1.599068 2.815 0.00716 **
## pop.density 0.002969 0.009547 0.311 0.75723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.738 on 46 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.002098, Adjusted R-squared: -0.0196
## F-statistic: 0.0967 on 1 and 46 DF, p-value: 0.7572
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.92739 NaN NaN NaN
## pop.density 0.03486 NaN NaN NaN
##
## Residual standard error: NaN on 0 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 1 and 0 DF, p-value: NA
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -118.40 -73.23 -38.69 -21.33 535.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.976 91.898 0.381 0.711
## pop.density 2.304 3.081 0.748 0.472
##
## Residual standard error: 183.2 on 10 degrees of freedom
## Multiple R-squared: 0.05295, Adjusted R-squared: -0.04175
## F-statistic: 0.5591 on 1 and 10 DF, p-value: 0.4718
First, a linear regression is performed on world COVID-19 cases and GDP. Then, scatter plots with a smooth line showing the relationship between GDP per capita and COVID-19 cases per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = gdp.cap, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("World COVID-19 Cases per Million People in vs. GDP per Capita")) #COVID-19 cases and GDP per capita on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totcases.mil, owid.countries$gdp.cap, use = "complete.obs") ## [1] 0.6567062
gdp.lm.cases <- lm(totcases.mil ~ gdp.cap, data = owid.countries)
gdp.lm.cases##
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
##
## Coefficients:
## (Intercept) gdp.cap
## -358.20891 0.08419
summary.lm(gdp.lm.cases) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5365.0 -832.3 -60.2 312.2 15342.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.582e+02 1.985e+02 -1.804 0.0729 .
## gdp.cap 8.419e-02 7.288e-03 11.552 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1911 on 176 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.4313, Adjusted R-squared: 0.428
## F-statistic: 133.5 on 1 and 176 DF, p-value: < 2.2e-16
cases.gdp <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = gdp.cap, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. GDP per Capita"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.gdp(i))
} #Loops function for continents of interest. First, a linear regression is performed on world COVID-19 deaths and GDP. Scatter plots with a smooth line showing the relationship between GDP per capita and COVID-19 deaths per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = gdp.cap, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("World COVID-19 Deaths per Million People in vs. GDP per Capita")) #COVID-19 deaths and GDP per capita on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$gdp.cap, use = "complete.obs") ## [1] 0.3552196
gdp.lm.deaths <- lm(totdeaths.mil ~ gdp.cap, data = owid.countries)
gdp.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
##
## Coefficients:
## (Intercept) gdp.cap
## 5.378126 0.002755
summary.lm(gdp.lm.deaths) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -314.35 -44.65 -18.93 -7.09 1075.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.378e+00 1.642e+01 0.328 0.744
## gdp.cap 2.755e-03 5.768e-04 4.777 4.04e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 148.3 on 158 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.1262, Adjusted R-squared: 0.1207
## F-statistic: 22.82 on 1 and 158 DF, p-value: 4.042e-06
deaths.gdp <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = gdp.cap, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. GDP per Capita"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.gdp(i))
} #Loops function for continents of interest. First, scatter plot shows all countries and the relationship between median age and COVID-19 outcomes. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 cases per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.
ggplot(data = owid.countries, aes(x = med.age, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Median Age", y = "Total COVID-19 Cases per Million People") +
labs(title = "World COVID-19 Cases per Million People vs. Median Population Age") #COVID-19 cases vs. median age on world scale. cor(owid.countries$totcases.mil, owid.countries$med.age, use = "complete.obs")## [1] 0.326925
age.lm.cases <- lm(totcases.mil ~ med.age, data = owid.countries)
age.lm.cases##
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## -1161.02 74.91
summary.lm(age.lm.cases) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2317.4 -954.8 -304.1 25.1 18524.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1161.02 518.02 -2.241 0.0263 *
## med.age 74.91 16.32 4.589 8.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1998 on 176 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.1069, Adjusted R-squared: 0.1018
## F-statistic: 21.06 on 1 and 176 DF, p-value: 8.428e-06
exp.model.cases <- lm(log(totcases.mil) ~ med.age, data = owid.countries)
exp.model.cases##
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## 1.6876 0.1259
summary.lm(exp.model.cases)##
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6445 -1.0443 0.0571 1.1717 4.1880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.68764 0.43260 3.901 0.000136 ***
## med.age 0.12587 0.01363 9.234 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.668 on 176 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.3263, Adjusted R-squared: 0.3225
## F-statistic: 85.26 on 1 and 176 DF, p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.## med.age
## 1 30.37978
cases.med.age <- function(x) {
owid.countries %>%
mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
mutate(med.age <- as.factor(med.age)) %>%
filter(continent == x) %>%
ggplot(aes(x = location, y = totcases.mil, fill = factor(med.age))) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Cases per Million People by Country in ", (continent = x), " and Median Age"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.med.age(i))
} #Loops function for continents of interest.First, scatter plot shows all countries and the relationship between median age and COVID-19 deaths. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 daeths per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.
ggplot(data = owid.countries, aes(x = med.age, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Median Age", y = "Total COVID-19 Deaths per Million People") +
labs(title = "World COVID-19 Deaths per Million People vs. Median Population Age") #COVID-19 deaths vs. median age on world scale. cor(owid.countries$totdeaths.mil, owid.countries$med.age, use = "complete.obs")## [1] 0.368092
age.lm.deaths <- lm(totdeaths.mil ~ med.age, data = owid.countries)
age.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## -101.444 4.981
summary.lm(age.lm.deaths) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -131.55 -53.94 -18.72 10.14 710.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -101.4439 31.9989 -3.170 0.00183 **
## med.age 4.9813 0.9947 5.008 1.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 118.3 on 160 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.1355, Adjusted R-squared: 0.1301
## F-statistic: 25.08 on 1 and 160 DF, p-value: 1.442e-06
exp.model.deaths <- lm(log(totdeaths.mil) ~ med.age, data = owid.countries)
exp.model.deaths##
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## -2.1782 0.1375
summary.lm(exp.model.deaths)##
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8493 -1.0508 0.1557 1.0573 4.6191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.17815 0.44962 -4.844 2.98e-06 ***
## med.age 0.13752 0.01398 9.839 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.663 on 160 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.377, Adjusted R-squared: 0.3731
## F-statistic: 96.8 on 1 and 160 DF, p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.## med.age
## 1 30.37978
deaths.med.age <- function(x) {
owid.countries %>%
mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
mutate(med.age <- as.factor(med.age)) %>%
filter(continent == x) %>%
ggplot(aes(x = location, y = totdeaths.mil, fill = factor(med.age))) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Deaths per Million People by Country in ", (continent = x), " and Median Age"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.med.age(i))
} #Loops function for continents of interest.Graphs below show percentages of female and male smokers by country and the correlation to COVID-19 cases.
ggplot(data = owid.countries, aes(x = fem.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Cases per Million People") +
labs(title = "World COVID-19 Cases per Million People vs. Percentage of Female Smokers") #COVID-19 cases vs. female smokers on world scale. cor(owid.countries$totcases.mil, owid.countries$fem.smokers, use = "complete.obs") ## [1] 0.2013998
ggplot(data = owid.countries, aes(x = male.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Cases per Million People") +
labs(title = "World COVID-19 Cases per Million People vs. Percentage of Male Smokers") #COVID-19 cases vs. male smokers on world scale. cor(owid.countries$totcases.mil, owid.countries$male.smokers, use = "complete.obs") ## [1] -0.06492578
cases.smoking.f <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = fem.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.smoking.f(i)) #Apply function to each continent.
}cases.smoking.m <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = male.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.smoking.m(i)) #Apply function to each continent.
}Graphs below show percentages of female and male smokers in the world and by country and the correlation to COVID-19 deaths.
ggplot(data = owid.countries, aes(x = fem.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Female Smokers") #COVID-19 deaths vs. female smokers on world scale. cor(owid.countries$totdeaths.mil, owid.countries$fem.smokers, use = "complete.obs") ## [1] 0.369769
ggplot(data = owid.countries, aes(x = male.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Male Smokers") #COVID-19 deaths vs. male smokers on world scale. cor(owid.countries$totdeaths.mil, owid.countries$male.smokers, use = "complete.obs") ## [1] -0.09632521
deaths.smoking.f <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = fem.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.smoking.f(i)) #Apply function to each continent.
}deaths.smoking.m <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = male.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.smoking.m(i)) #Apply function to each continent.
}#Smoking likely not significant: when adjusting for GDP, statistical significance actually goes down. Same occurs when applied code to total deaths (code not shown).
summary(lm(totcases.mil ~ fem.smokers + gdp.cap, data = owid.countries)) #Female smokers with and without ajdusting for GDP. ##
## Call:
## lm(formula = totcases.mil ~ fem.smokers + gdp.cap, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5439.7 -775.0 -170.8 266.3 10198.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -190.31965 219.81352 -0.866 0.388
## fem.smokers -21.23930 14.34715 -1.480 0.141
## gdp.cap 0.08348 0.00719 11.611 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1637 on 135 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.5139, Adjusted R-squared: 0.5067
## F-statistic: 71.35 on 2 and 135 DF, p-value: < 2.2e-16
summary(lm(totcases.mil ~ fem.smokers, data = owid.countries))##
## Call:
## lm(formula = totcases.mil ~ fem.smokers, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2456.1 -1111.7 -872.8 341.9 18794.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 921.37 284.94 3.234 0.00153 **
## fem.smokers 46.61 19.29 2.415 0.01703 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2384 on 138 degrees of freedom
## (46 observations deleted due to missingness)
## Multiple R-squared: 0.04056, Adjusted R-squared: 0.03361
## F-statistic: 5.834 on 1 and 138 DF, p-value: 0.01703
summary(lm(totcases.mil ~ male.smokers + gdp.cap, data = owid.countries)) #Male smokers with and without adjusting for GDP. ##
## Call:
## lm(formula = totcases.mil ~ male.smokers + gdp.cap, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5076.1 -766.7 -47.3 311.9 10718.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.141e+02 3.966e+02 -0.792 0.430
## male.smokers -2.003e+00 1.021e+01 -0.196 0.845
## gdp.cap 8.041e-02 6.692e-03 12.016 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1618 on 133 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.5231, Adjusted R-squared: 0.516
## F-statistic: 72.95 on 2 and 133 DF, p-value: < 2.2e-16
summary(lm(totcases.mil ~ male.smokers, data = owid.countries))##
## Call:
## lm(formula = totcases.mil ~ male.smokers, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1655.0 -1295.8 -953.0 571.4 18299.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1762.69 534.72 3.296 0.00125 **
## male.smokers -11.47 15.12 -0.759 0.44931
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2426 on 136 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.004215, Adjusted R-squared: -0.003107
## F-statistic: 0.5757 on 1 and 136 DF, p-value: 0.4493
First, a linear regression is performed on world COVID-19 cases and diabetes prevalence. Then, scatter plots with a smooth line showing the relationship of diabetes prevalence and COVID-19 cases per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = diab.prev, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Diabetes Prevalence", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("World COVID-19 Cases per Million People in vs. Diabetes Prevalence")) #COVID-19 cases and diabetes prevalence on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totcases.mil, owid.countries$diab.prev, use = "complete.obs") ## [1] 0.1093217
diab.lm.cases <- lm(totcases.mil ~ diab.prev, data = owid.countries)
diab.lm.cases##
## Call:
## lm(formula = totcases.mil ~ diab.prev, data = owid.countries)
##
## Coefficients:
## (Intercept) diab.prev
## 711.48 74.21
summary.lm(diab.lm.cases) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totcases.mil ~ diab.prev, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2082.2 -1148.3 -879.2 259.4 18641.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 711.48 430.99 1.651 0.101
## diab.prev 74.21 50.29 1.476 0.142
##
## Residual standard error: 2576 on 180 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.01195, Adjusted R-squared: 0.006462
## F-statistic: 2.177 on 1 and 180 DF, p-value: 0.1418
cases.diab <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = diab.prev, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Diabetes Prevalence", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Diabetes Prevalence"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.diab(i))
} #Loops function for continents of interest. First, a linear regression is performed on world COVID-19 deaths and diabetes prevalence. Scatter plots with a smooth line showing the relationship of diabetes prevalence and COVID-19 deaths per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = diab.prev, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Diabetes Prevalence", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("World COVID-19 Deaths per Million People in vs. Diabetes Prevalence")) #COVID-19 deaths and diabetes prevalence on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$diab.prev, use = "complete.obs") ## [1] -0.1266164
diab.lm.deaths <- lm(totdeaths.mil ~ diab.prev, data = owid.countries)
diab.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ diab.prev, data = owid.countries)
##
## Coefficients:
## (Intercept) diab.prev
## 105.516 -5.521
summary.lm(diab.lm.deaths) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totdeaths.mil ~ diab.prev, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.80 -65.23 -46.68 -6.19 1163.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.516 28.750 3.670 0.000329 ***
## diab.prev -5.521 3.398 -1.625 0.106183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 162.4 on 162 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.01603, Adjusted R-squared: 0.009958
## F-statistic: 2.639 on 1 and 162 DF, p-value: 0.1062
deaths.diab <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = diab.prev, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Diabetes Prevalence", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Diabetes Prevalence"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.diab(i))
} #Loops function for continents of interest. First, a linear regression is performed on world COVID-19 cases and cardiovascular death. Then, scatter plots with a smooth line showing the relationship between cardiovascular death and COVID-19 cases per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = card.death, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Cardiovascular Death", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("World COVID-19 Cases per Million People in vs. Cardiovascular Death")) #COVID-19 cases and cardiovascular death on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totcases.mil, owid.countries$card.death, use = "complete.obs") ## [1] -0.3278695
card.lm.cases <- lm(totcases.mil ~ card.death, data = owid.countries)
card.lm.cases##
## Call:
## lm(formula = totcases.mil ~ card.death, data = owid.countries)
##
## Coefficients:
## (Intercept) card.death
## 2750.669 -6.151
summary.lm(card.lm.cases) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totcases.mil ~ card.death, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2130.0 -1126.2 -637.7 374.0 18089.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2750.669 377.291 7.291 9.89e-12 ***
## card.death -6.151 1.332 -4.617 7.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2086 on 177 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.1075, Adjusted R-squared: 0.1025
## F-statistic: 21.32 on 1 and 177 DF, p-value: 7.452e-06
cases.card <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = card.death, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Cardiovascular Death", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Cardiovascular Death"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.card(i))
} #Loops function for continents of interest. First, a linear regression is performed on world COVID-19 deaths and cardiovascular death rates from 2017. Scatter plots with a smooth line showing the relationship between cardiovascular death rates and COVID-19 deaths per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = card.death, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Cardiovascular Death", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("World COVID-19 Deaths per Million People in vs. Cardiovascular Death")) #COVID-19 deaths and cardiovascular death on world scale. Red line is smooth fit; purple line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$card.death, use = "complete.obs") ## [1] -0.3752894
card.lm.deaths <- lm(totdeaths.mil ~ card.death, data = owid.countries)
card.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ card.death, data = owid.countries)
##
## Coefficients:
## (Intercept) card.death
## 164.9514 -0.4312
summary.lm(card.lm.deaths) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totdeaths.mil ~ card.death, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -123.63 -66.95 -36.31 22.85 701.45
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164.9514 23.4724 7.027 5.73e-11 ***
## card.death -0.4312 0.0842 -5.121 8.62e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 126 on 160 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.1408, Adjusted R-squared: 0.1355
## F-statistic: 26.23 on 1 and 160 DF, p-value: 8.622e-07
deaths.card <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = card.death, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "purple") +
theme_bw() +
labs(x = "Cardiovascular Death", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Cardiovascular Death"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.card(i))
} #Loops function for continents of interest. World map file is read in and prepared for overlaying with COVID-19 data.
library(sf)
#File downloaded from https://hub.arcgis.com/datasets/esri::world-countries-generalized-1/explore and moved to working directory. File was unzipped and folder renamed to "world_maps".
world.sf <- st_read("world_maps/country_gen_trim.shp") # Read shapefile as an sf object## Reading layer `country_gen_trim' from data source `/Users/alanaschreibman/COVID-19_NO2/world_maps/country_gen_trim.shp' using driver `ESRI Shapefile'
## Simple feature collection with 256 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -89 xmax: 180 ymax: 83.6236
## Geodetic CRS: WGS 84
class(world.sf)## [1] "sf" "data.frame"
head(world.sf)## Simple feature collection with 6 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -176.6431 ymin: -21.94084 xmax: -138.8098 ymax: 0.808609
## Geodetic CRS: WGS 84
## OBJECTID FIPS_CNTRY ISO_2DIGIT ISO_3DIGIT NAME COUNTRYAFF
## 1 1 AQ AS ASM American Samoa United States
## 2 2 WQ UM UMI Baker United States
## 3 3 CW CK COK Cook Islands New Zealand
## 4 4 FP PF PYF French Polynesia France
## 5 5 WQ UM UMI Howland United States
## 6 6 WQ UM UMI Jarvis United States
## CONTINENT SHAPE_Leng SHAPE_Area geometry
## 1 Oceania 0.60012436 1.371972e-02 MULTIPOLYGON (((-170.7439 -...
## 2 Oceania 0.02887532 3.430265e-05 MULTIPOLYGON (((-176.4614 0...
## 3 Oceania 0.98066416 1.307346e-02 MULTIPOLYGON (((-159.747 -2...
## 4 Oceania 3.93021062 1.753321e-01 MULTIPOLYGON (((-149.1792 -...
## 5 Oceania 0.05303007 1.811934e-04 MULTIPOLYGON (((-176.6362 0...
## 6 Oceania 0.09758213 6.658148e-04 MULTIPOLYGON (((-160.0211 -...
world.geo <- st_geometry(world.sf)
plot(world.geo)plot(world.geo, col = "lemonchiffon2")plot(world.geo, lwd = 0.5, border = "red")Joins OWID data set with world map data frame and then use country cases to shade countries accordingly.
library(RColorBrewer)
library(viridis)
my_theme <- function() {
theme_minimal() +
theme(axis.line = element_blank(),
plot.title = element_text(size=30),
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_line(color = "white"),
legend.key.size = unit(0.8, "cm"),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20))
}
myPalette <- colorRampPalette(brewer.pal(9, "BuPu"))
world.sf[105, "NAME"] <- "Democratic Republic of Congo"
world.sf <- world.sf %>%
rename(location = NAME) %>%
inner_join(owid.countries, world.sf, by = "location") #Joined OWID data set with .sph file for geospatial analysis.
ggplot(data = world.sf, aes(fill = totcases.mil),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Population-Adjusted Distribution of COVID-19 Cases") +
scale_fill_gradientn(name = "COVID-19 \ncases (per million)",
limits = c(0, 6500),
colours = myPalette(100)) #Plot of world distribution of COVID-19 cases with color shading. Uses merged data frame and shades deaths by country accordingly.
ggplot(data = world.sf, aes(fill = totdeaths.mil),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Population-Adjusted Distribution of COVID-19 Deaths") +
scale_fill_gradientn(name = "COVID-19 \ndeaths (per million)",
limits = c(0, 700),
colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. Uses merged data frame and shades GDP per capita by country accordingly.
ggplot(data = world.sf, aes(fill = gdp.cap),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Global Distribution of GDP per Capita") +
scale_fill_gradientn(name = "GDP \nper capita",
limits = c(0, 60000),
colours = myPalette(100)) Uses merged data frame and shades population density by country accordingly.
ggplot(data = world.sf, aes(fill = pop.density),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Global Distribution of Population Density") +
scale_fill_gradientn(name = "Population \ndensity",
limits = c(0, 450),
colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. Uses merged data frame and shades median age by country accordingly.
ggplot(data = world.sf, aes(fill = med.age),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Global Distribution of Median Age") +
scale_fill_gradientn(name = "Median \n age",
limits = c(0, 50),
colours = myPalette(100)) Uses merged data frame and shades male and female smoking percentages by country accordingly.
ggplot(data = world.sf, aes(fill = fem.smokers),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Percentage of Female Smokers by Country") +
scale_fill_gradientn(name = "Female \nsmokers (%)",
limits = c(0, 40),
colours = myPalette(100)) ggplot(data = world.sf, aes(fill = male.smokers),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Percentage of Male Smokers by Country") +
scale_fill_gradientn(name = "Male \nsmokers (%)",
limits = c(0, 80),
colours = myPalette(100)) Uses merged data frame and shades diabetes prevalence by country accordingly.
ggplot(data = world.sf, aes(fill = diab.prev),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Diabetes Prevalence by Country") +
scale_fill_gradientn(name = "Diabetes \nprevalence (%)",
limits = c(0, 20),
colours = myPalette(100)) Uses merged data frame and shades indicence of cardiovascular death by country accordingly.
ggplot(data = world.sf, aes(fill = card.death),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Incidence of Cardiovascular Death by Country") +
scale_fill_gradientn(name = "Diabetes \nprevalence (%)",
limits = c(0, 800),
colours = myPalette(100))